Showcase

library(here)
source(here("setup.R"))
options(encoding = "UTF-8")
Sys.setlocale("LC_ALL", "C")
here() starts at /Users/stefan/workspace/work/phd/thesis
'C/C/C/C/C/de_DE'
full_truth_county <- read_csv(here("data/processed/RKI_county_weekly.csv")) %>%
    arrange(date, ags)

full_truth <- full_truth_county %>%
    group_by(date) %>%
    summarize(cases = sum(cases, na.rm = TRUE))

ags_county_dict <- read_csv(here("data/processed/ags_county_dict.csv")) %>%
    arrange(ags) %>%
    mutate(c = 1:n())
population <- read_csv(here('data/processed/population.csv'))
showcase_results <- read_csv(here("data/results/4_local_outbreak_model/showcase.csv"))
y_total_result <- showcase_results %>% 
    filter(variable == "y_total")
rename_pct_to_decimal <- function(x) as.numeric(gsub("%", "", x)) / 100

y_total_dist <- y_total_result %>% 
    select(ends_with('%')) %>%
    pivot_longer(cols = everything(), names_to = "percentile", values_to = "value") %>%
    mutate(percentile = rename_pct_to_decimal(percentile)) %>%
    bind_rows(tibble(value = c(0, Inf), percentile = c(0, 1))) %>%
    arrange(value)

Comparison to NBinom baseline

prediction_date <- showcase_results %>% pull(date) %>% max
baseline_df <- full_truth %>%
    filter(date <= prediction_date - 7, date >= prediction_date - 14) 

growth_rate <- baseline_df %>%
    pull(cases) %>%
    log %>% diff %>% exp


lambda_old <- baseline_df %>%
    pull(cases) %>%
    tail(1)

lambda_new <- lambda_old * growth_rate

growth_rate
lambda_old
lambda_new
1.66609294320138
3872
6451.11187607574
baseline_county <- full_truth_county %>% filter(date <= prediction_date - 7, date >= prediction_date - 14)
I_old_county <- baseline_county %>%
    group_by(ags) %>%
    summarize(cases = cases[2]) 

emp_rho_c <- baseline_county %>%
    group_by(ags) %>%
    summarize(growth_rate = exp(diff(log(cases)))) 
rho_c_cleaned <- emp_rho_c %>%
    mutate(growth_rate = ifelse(is.na(growth_rate) | is.infinite(growth_rate), 1.0, growth_rate)) %>%
    mutate(growth_rate = pmin(growth_rate, 3))

lambda_new_county <-  rho_c_cleaned %>%
    inner_join(I_old_county) %>%
    mutate(intensity_prediction = growth_rate * cases) %>%
    pull(intensity_prediction)
Joining with `by = join_by(ags)`
total_nb_obs <- full_truth %>%
    filter(date >= ymd('2020-04-25'), date<= prediction_date - 7) %>%
    mutate(rho = cases / lag(cases)) %>%
    mutate(predicted = lag(cases) * lag(rho)) %>%
    tail(-2) %>%
    dplyr::select(y = cases, mu = predicted)

est_r <- function(df) {
    loglik <- function(r) -sum(dnbinom(df$y, mu = df$mu, size = r))
    r_est <- optimize(loglik, interval=c(1e-3,1e3))$minimum
}

r_country <- est_r(total_nb_obs)
nb_quantiles <- qnbinom(y_total_dist$percentile, mu = lambda_new, size = r_country) %>%
    tibble(
        percentile = y_total_dist$percentile,
        value = .
    ) %>%
    pivot_wider(names_from = percentile, values_from = value) %>%
    mutate(date = prediction_date)
nb_quantiles

r_country
A tibble: 1 × 26
0 0.01 0.025 0.05 0.1 0.15 0.2 0.25 0.3 0.35 0.7 0.75 0.8 0.85 0.9 0.95 0.975 0.99 1 date
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <date>
0 5250 5426 5581 5763 5888 5988 6076 6155 6228 6726 6810 6905 7017 7159 7373 7562 7786 Inf 2020-06-27
143.094444769947
lambda_new_county[99]
2568
N_samples <- 1e5
total_county_nb_obs <- full_truth_county %>%
    filter(date >= ymd('2020-04-25'), date<= prediction_date - 7) %>%
    group_by(ags) %>%
    mutate(rho = cases / lag(pmax(cases, 1))) %>%
    mutate(predicted = pmax(lag(cases) * lag(rho), 1)) %>%
    ungroup() %>%
    filter(!is.na(predicted)) %>%
    dplyr::select(ags, y = cases, mu = predicted)

county_rs <- total_county_nb_obs %>%
    group_by(ags) %>%
    summarize(r = est_r(cur_data())) %>%
    mutate(mu = lambda_new_county)

nbinom_county_samples <- rnbinom(N_samples * 400, mu = county_rs$mu, size = county_rs$r) %>%
    matrix(c(400, N_samples)) 

nb_county_quantiles <-  nbinom_county_samples %>%
    pmin(population$population) %>%
    colSums %>%
    quantile(probs = y_total_dist$percentile) %>%
    tibble(
        percentile = y_total_dist$percentile,
        value = .
    ) %>%
    pivot_wider(names_from = percentile, values_from = value) %>%
    mutate(date = prediction_date)
nb_county_quantiles_no_GL <- nbinom_county_samples[-99,] %>%
    colSums %>%
    quantile(probs = y_total_dist$percentile) %>%
    tibble(
        percentile = y_total_dist$percentile,
        value = .
    ) %>%
    pivot_wider(names_from = percentile, values_from = value) %>%
    mutate(date = prediction_date)
Warning message:

"There was 1 warning in `summarize()`.

i In argument: `r = est_r(cur_data())`.

i In group 1: `ags = "01001"`.

Caused by warning:

! `cur_data()` was deprecated in dplyr 1.1.0.

i Please use `pick()` instead."
nb_county_quantiles
A tibble: 1 × 26
0 0.01 0.025 0.05 0.1 0.15 0.2 0.25 0.3 0.35 0.7 0.75 0.8 0.85 0.9 0.95 0.975 0.99 1 date
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <date>
4759 5611.99 5829 6035 6282 6465 6619 6759 6888 7011 7957 8139 8358 8631 9018 9778 11061.02 16065.13 249228 2020-06-27

tikz/regional_showcase_predictions.tex

y_total_result
A spec_tbl_df: 1 × 29
variable c t mean sd 1.0 % 2.5 % 5.0 % 10.0 % 15.0 % 65.0 % 70.0 % 75.0 % 80.0 % 85.0 % 90.0 % 95.0 % 97.5 % 99.0 % date
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <date>
y_total 0 10 4170.382 666.6667 2261.53 2842.372 3605.151 3605.479 3605.806 4721.306 4721.794 4722.282 4722.769 4723.257 4723.745 5012.9 5277.69 5281.091 2020-06-27
dx <- .4

alpha_fct <- factor(c("median", "$50 \\%$ PI", "$95 \\%$ PI"), levels=c("median", "$50 \\%$ PI", "$95 \\%$ PI"))
col = pal_npg("nrc")(10)[3]


geom_prediction_interval <- function(model, data, dx=1, offset=0) {
    list(
        geom_rect(aes(xmin = date - dx + offset, xmax= date+dx + offset, ymin = `0.025`, ymax = `0.975`, alpha=alpha_fct[3], fill=model), data=data),
        geom_rect(aes(xmin = date - dx + offset, xmax= date+dx + offset, ymin = `0.25`, ymax = `0.75`, alpha=alpha_fct[2], fill=model), data=data),
        geom_rect(aes(xmin= date - dx + offset, xmax=date + dx + offset, ymin = `0.5`-30, ymax=`0.5`+30, alpha=alpha_fct[1], fill=model), data=data)
    )
}

y_result_plot <- y_total_result %>%
    rename_with(function(x) as.character(rename_pct_to_decimal(x)), ends_with('%')) 

full_truth %>%
    inner_join(y_result_plot, by=c("date"="date")) %>%
    ggplot(aes()) +
    geom_prediction_interval("ssm", data=y_result_plot, dx=dx, offset=-2*dx) +
    geom_prediction_interval("nbinom", data=nb_quantiles, dx=dx,offset=-2*dx) +
    geom_prediction_interval("reg. nbinom", data=nb_county_quantiles, dx=dx, offset=0*dx) +
    geom_prediction_interval("reg. nbinom, no GL", data=nb_county_quantiles_no_GL, dx=dx, offset=2*dx) +

    geom_line(aes(date, cases), data= full_truth %>% filter(date >= ymd('2020-04-25'), date <= ymd('2020-06-27'))) +
    geom_point(aes(date, cases), data= full_truth %>% filter(date >= ymd('2020-04-25'), date <= ymd('2020-06-27')), size=5)  +
    scale_alpha_manual(name="", values=c("median"=1, "$50 \\%$ PI"=0.5, "$95 \\%$ PI"=0.3)) +
    labs(x="", y="$I_{t, \\cdot}$", fill="") +
    scale_x_date(
        breaks = function(x) seq(ceiling_date(x[1], "week", week_start = 6), floor_date(x[2], "week", week_start = 6), by = "2 week"),
        date_labels = "%d %b %y",
        expand = expansion(mult = c(0.01, 0.01))
    ) +
    theme(
        panel.grid.major.x = element_line(linewidth = 1.2),
        panel.grid.minor.x = element_line(linewidth = 0.5)
    ) +
    scale_fill_npg()

ggsave_tikz(here("tikz/regional_showcase_prediction.tex"), width=16/1.5, height=7/1.5)
pdf: 2

full_truth %>%
    filter(date <= prediction_date, date >= prediction_date - 21)
A tibble: 4 × 2
date cases
<date> <dbl>
2020-06-06 2454
2020-06-13 2324
2020-06-20 3872
2020-06-27 3433

tables/regional_showcase_theta.tex

parameters <- read_csv(here("data/results/4_local_outbreak_model/estimated_parameters.csv"), col_names = c("parameter", "value")) %>% 
    pivot_wider(names_from = parameter, values_from = value) %>%
    rename(
        "$\\alpha$" = alpha,
        "$\\sigma_{\\overline{\\log \\rho}}$" = sigma_r,
        "$\\sigma_{S}$" = "sigma spatial",
        "$\\bar{q}$" = q,
        "$C$" = C,
        "$r$" = r
    ) %>%
    pivot_longer(everything(), names_to="parameter", values_to="estimate")

parameters %>%
    kable(format = "latex", digits=3, booktabs=T, escape=F) %>%
    cat(., file=here("tables/regional_showcase_theta.tex"))
log_rhos <- showcase_results %>%
    filter(str_starts(variable, "log_rho_")) %>%
    # extract both indices from variable names like log_rho_2_1
    mutate(c = as.numeric(str_extract(variable, "log_rho_(\\d+)_(\\d+)", group=1))) %>%
    mutate(t = as.numeric(str_extract(variable, "log_rho_(\\d+)_(\\d+)", group=2))) 
log_rhos %>%
    head()

log_rhos %>%
    mutate(c = factor(c)) %>%
    ggplot(aes(t, exp(mean), group=c)) +
    geom_line(show.legend=FALSE, alpha=1/100) +
    xlim(NA, 9) +
    scale_y_log10()
A tibble: 6 × 29
variable c t mean sd 1.0 % 2.5 % 5.0 % 10.0 % 15.0 % 65.0 % 70.0 % 75.0 % 80.0 % 85.0 % 90.0 % 95.0 % 97.5 % 99.0 % date
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <date>
log_rho_1_1 1 1 -1.2163769 0.1598178 -1.68070449 -1.38564169 -1.38209844 -1.23059352 -1.2302142 -1.2264477 -1.2263806 -1.2263136 -1.2262465 -1.2261794 -1.2261123 -0.9170889 -0.6766719 -0.6762008 2020-04-25
log_rho_2_1 2 1 0.5549448 0.2022438 0.01482255 0.01658301 0.01830238 0.07906171 0.6228041 0.6312126 0.6315221 0.6318316 0.6321411 0.6324505 0.6327600 0.6330695 0.6332242 0.6333171 2020-04-25
log_rho_3_1 3 1 -1.2193435 0.1411181 -1.73760412 -1.73510571 -1.73333119 -1.39045531 -1.1816277 -1.1714910 -1.1714093 -1.1713275 -1.1712457 -1.1711639 -1.1710822 -1.1710004 -1.1709595 -1.1709350 2020-04-25
log_rho_4_1 4 1 0.3627378 0.1114700 -0.40008267 0.28558075 0.28945849 0.37452239 0.3799041 0.3847352 0.3848608 0.3849863 0.3851119 0.3852374 0.3853630 0.3854885 0.3855513 0.3855889 2020-04-25
log_rho_5_1 5 1 -1.6572183 0.5181756 -3.50727209 -3.49655451 -3.47869189 -2.22266098 -1.5013308 -1.4963833 -1.4962190 -1.4960547 -1.4958903 -1.4957260 -1.4955616 -1.4953973 -1.4953151 -0.9410035 2020-04-25
log_rho_6_1 6 1 -0.2124819 0.1065888 -0.54829519 -0.54787938 -0.54719614 -0.36253192 -0.1835588 -0.1812240 -0.1811594 -0.1810947 -0.1810301 -0.1809654 -0.1809007 -0.1808361 -0.1808037 -0.1807844 2020-04-25
Warning message:

"Removed 400 rows containing missing values or values outside the scale range

(`geom_line()`)."

approaches <- factor(c("ssm", "empirical"), levels = c("ssm", "empirical"))

ssm_app <- approaches[1]
empirical_app <- approaches[2]

emp_rho_plt <- full_truth_county %>%
    filter(date >= ymd('2020-04-18'), date<= prediction_date - 7) %>%
    group_by(ags) %>% 
    mutate(rho = cases / lag(pmax(cases, 1))) %>%
    filter(!is.na(rho)) 

showcase_results %>%
    filter(variable == "log_rho") %>%
    ggplot(aes(date, exp(`50.0 %`), color=ssm_app)) +
    geom_boxplot(aes(date-1, rho, color=empirical_app, group = date, linetype='county'), data = emp_rho_plt, width=2) +
    geom_boxplot(aes(date+1, group = date, color=ssm_app, linetype='county'), data = log_rhos, width=2) +
    geom_line(aes(linetype='country'), linewidth=2) +
    geom_line(data = full_truth %>% mutate(rho = cases / lag(cases)), aes(date, rho, color=empirical_app, linetype='country'), linewidth=2) +
    ylim(0,4) +
    scale_x_date(
        breaks = function(x) seq(ceiling_date(x[1], "week", week_start = 6), floor_date(x[2], "week", week_start = 6), by = "2 week"),
        date_labels = "%d %b %y",
        expand = expansion(mult = c(0.01, 0.01)),
        limits = c(ymd('2020-04-20'), ymd('2020-06-24'))
    ) +
    labs(color="approach", linetype='level', x="", y="$\\rho_{t}$") +
    scale_linetype_manual(values = c('county'='solid', 'country'='dashed')) 

ggsave_tikz(here("tikz/regional_showcase_rho.tex"), width=16/1.5, height=7/1.5)
Warning message:

"Removed 97 rows containing non-finite outside the scale range

(`stat_boxplot()`)."

Warning message:

"Removed 400 rows containing missing values or values outside the scale range

(`stat_boxplot()`)."

Warning message:

"Removed 49 rows containing non-finite outside the scale range

(`stat_boxplot()`)."

Warning message:

"Removed 1 row containing missing values or values outside the scale range

(`geom_line()`)."

Warning message:

"Removed 166 rows containing missing values or values outside the scale range

(`geom_line()`)."

Warning message:

"Removed 97 rows containing non-finite outside the scale range

(`stat_boxplot()`)."

Warning message:

"Removed 400 rows containing missing values or values outside the scale range

(`stat_boxplot()`)."

Warning message:

"Removed 49 rows containing non-finite outside the scale range

(`stat_boxplot()`)."

Warning message:

"Removed 1 row containing missing values or values outside the scale range

(`geom_line()`)."

Warning message:

"Removed 166 rows containing missing values or values outside the scale range

(`geom_line()`)."
pdf: 2

log_rhos %>% 
    filter(date == prediction_date - 7) %>%
    inner_join(full_truth_county %>% filter(date == prediction_date - 7) %>% mutate(c = 1:400) %>% select(c, cases)) %>%
    select(`50.0 %`, cases)  %>%
    mutate(mu = exp(`50.0 %`) * cases)  %>%
    filter(cases > 19)
Joining with `by = join_by(c)`
A tibble: 32 × 3
50.0 % cases mu
<dbl> <dbl> <dbl>
0.22141410 27 33.69168
0.13619816 27 30.93954
0.81227614 135 304.15910
0.16651360 77 90.95083
-0.19824822 28 22.96465
-0.46848131 43 26.91594
0.42210002 96 146.41546
0.21066202 82 101.22859
0.26134013 28 36.36274
0.51631338 23 38.54428
1.30123256 29 106.54084
-0.37364730 24 16.51727
0.09001093 38 41.57908
0.82648694 24 54.84663
0.56741722 31 54.67488
2.46741720 193 2275.84658
2.48864543 856 10310.47665
0.38797962 54 79.59599
1.17074579 26 83.83431
1.32054362 29 108.61825
0.88694198 37 89.82469
-0.04494725 52 49.71449
1.00240337 31 84.46950
0.62712586 23 43.06110
0.02060513 27 27.56211
-0.56278372 22 12.53167
0.90301542 21 51.80765
0.26862714 57 74.56554
0.17735707 22 26.26926
0.74328202 536 1127.11458
3.21901521 27 675.09409
0.96394301 84 220.24924
log_rhos %>% 
    filter(date == prediction_date - 7) %>%
    ggplot(aes(exp(`50.0 %`))) +
    geom_histogram()
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

showcase_results %>%
    filter(str_starts(variable, "y_total_")) %>%
    mutate(c = as.numeric(str_extract(variable, "y_total_(\\d+)", group=1))) %>%
    inner_join(mutate(ags_county_dict, c=1:400), by = "c") %>%
    inner_join(filter(full_truth_county, date == '2020-06-13'), by=c("ags")) %>%
    filter(mean >= 100) %>%
    arrange(desc(mean)) %>%
    ggplot(aes(y = name, x = mean)) +
    geom_errorbarh(aes(xmin = `2.5 %`, xmax = `97.5 %`)) +
    geom_point(aes(x = `50.0 %`), pch='x', color='red', size=10) +
    geom_point(aes(x = cases))

Exchange matrix P

ags_county_dict <- read_csv(here("data/processed/ags_county_dict.csv"))
ags_state_dict <- read_csv(here("data/processed/ags_state_dict.csv"))

index_to_state <- ags_county_dict %>% 
    mutate(i = 1:400) %>%
    mutate(state_ags = str_sub(ags, end=2)) %>%
    rename(county_name = name) %>%
    inner_join(ags_state_dict, by=c("state_ags"="ags"))  %>%
    select(i, state_ags, county_name, state_name = name)
P <- read_delim(here("data/results/4_local_outbreak_model/showcase_P_matrix.csv"), delim = " ", col_names = 1:400) %>%
    as.matrix()


colnames(P) <- NULL
P_long_states <- melt(P) %>%
    rename(i = Var1, j = Var2) %>%
    inner_join(select(index_to_state, i, state_name_i = state_name), by=c("i")) %>%
    inner_join(select(index_to_state, j=i, state_name_j = state_name), by=c("j")) 
    
P_long_states %>%
    filter(state_name_i == state_name_j) %>%
    ggplot(aes(i,-j, fill=value)) +
    geom_tile() +
    scale_fill_gradient2(
        low = "white",
        mid = pal_npg("nrc")(5)[5],
        high = pal_npg("nrc")(5)[1],
        midpoint = 0.55,  # Set midpoint where you want emphasis
        limits = c(0,1)
    ) +
    facet_wrap(~state_name_i, scales="free") +
    theme_void() +
    labs(fill= "$p_{r,r'}$")


ggsave_tikz(here("tikz/P_matrix.tex"))
pdf: 2

tibble(
    total = colSums(P),
    i = 1:400,
    p_ii = diag(P)
) %>%
    inner_join(index_to_state, by=c("i")) %>%
    filter(total < 0.8) %>%
    ggplot(aes(x = reorder(county_name, -total), y = total, fill=state_name)) +
    geom_bar(stat = "identity") +
    coord_flip()

tibble(
    p_ii = diag(P),
    i = 1:400
) %>%
    inner_join(index_to_state, by=c("i")) %>%
    ggplot(aes(x = p_ii)) +
    geom_histogram()
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# solve utf-8 encoding issues
Sys.setlocale("LC_ALL", "UTF-8")
'C/UTF-8/C/C/C/de_DE'
county_shorthand <- function(county_name) {
    ifelse(
        str_detect(county_name, "Stadt"),
        paste0(str_sub(county_name, 1, 3), ". Stadt"),
        paste0(str_sub(county_name, 1, 3), ".")
    )
}
saxony_ags <- ags_state_dict %>%
    filter(name == "Sachsen") %>%
    pull(ags)

saxony_counties <- ags_county_dict %>%
    mutate(i = 1:400) %>%
    filter(str_starts(ags, saxony_ags)) %>%
    rename(county_name = name) %>%
    select(i, county_name)


P_saxony <- melt(P) %>%
    rename(i = Var1, j = Var2) %>%
    inner_join(select(saxony_counties, i, county_name_i = county_name), by=c("i")) %>%
    inner_join(select(saxony_counties, j=i, county_name_j = county_name), by=c("j"))  



p_rr <- P_saxony %>%
    filter(i == j) %>%
    mutate(county_name_i_short = county_shorthand(county_name_i)) 


plt_saxony <- P_saxony %>%
    #mutate(value = ifelse(value < 0.01, NA, value)) %>%
    mutate(value = ifelse(i==j, NA, value)) %>%
    mutate(county_name_i_short = county_shorthand(county_name_i))  %>%
    ggplot() +
    geom_tile(aes(county_name_i_short,county_name_j, fill=value)) +
    scale_fill_gradient(
        low = "white",
        high = pal_npg("nrc")(10)[5],
        na.value = "white",
        limits = c(0, 0.16)
    ) +
    geom_tile(aes(county_name_i_short,county_name_j), fill=pal_npg("nrc")(2)[2], data=p_rr) +
    scale_y_discrete(limits=rev) +
    theme_minimal() +
    labs(fill= "$p_{r,r'}$", x="r", y="r'") +
    theme(
        axis.text.x = element_text(angle=90, hjust=1)
    ) +
    coord_fixed()


plt_marg_right <- melt(P) %>%
    rename(i = Var1, j = Var2) %>%
    inner_join(select(saxony_counties, j=i, county_name_j = county_name), by=c("j"))  %>%
    group_by(county_name_j) %>%
    summarize(value = sum(value)) %>%
    ggplot(aes(county_name_j, value)) +
    geom_bar(stat="identity", aes(fill = "$\\sum_{r\\neq r'} p_{r,r'}$")) +
    geom_bar(data=p_rr, aes(county_name_j, value, fill="$p_{r',r'}$"), stat="identity")+
    theme_minimal() +
    scale_x_discrete(limits=rev)+
    scale_y_continuous(breaks = seq(0,1.5, .2)) +
    coord_flip() +
    labs(x="", y="$\\sum\\limits_{r=1}^R p_{r,r'}$", fill = "")+
    theme(
        axis.text.y = element_blank(),
        axis.ticks.y = element_blank(),
        axis.title.y = element_blank()
    ) +
    scale_fill_npg()

((plt_saxony | plt_marg_right)) + plot_layout(guides = "collect", axes = "collect_y")

ggsave_tikz(here('tikz/P_matrix_saxony.tex'), width=16/1.5, height=9/1.5)
Warning message in (function (texString, cex = 1, face = 1, engine = getOption("tikzDefaultEngine"), :
“Attempting to calculate the width of a Unicode stringusing the pdftex engine. This may fail! See the Unicodesection of ?tikzDevice for more information.”
Warning message in (function (texString, cex = 1, face = 1, engine = getOption("tikzDefaultEngine"), :
“Attempting to calculate the width of a Unicode stringusing the pdftex engine. This may fail! See the Unicodesection of ?tikzDevice for more information.”
Warning message in (function (texString, cex = 1, face = 1, engine = getOption("tikzDefaultEngine"), :
“Attempting to calculate the width of a Unicode stringusing the pdftex engine. This may fail! See the Unicodesection of ?tikzDevice for more information.”
Warning message in (function (texString, cex = 1, face = 1, engine = getOption("tikzDefaultEngine"), :
“Attempting to calculate the width of a Unicode stringusing the pdftex engine. This may fail! See the Unicodesection of ?tikzDevice for more information.”
Warning message in (function (texString, cex = 1, face = 1, engine = getOption("tikzDefaultEngine"), :
“Attempting to calculate the width of a Unicode stringusing the pdftex engine. This may fail! See the Unicodesection of ?tikzDevice for more information.”
Warning message in (function (texString, cex = 1, face = 1, engine = getOption("tikzDefaultEngine"), :
“Attempting to calculate the width of a Unicode stringusing the pdftex engine. This may fail! See the Unicodesection of ?tikzDevice for more information.”
Warning message in (function (texString, cex = 1, face = 1, engine = getOption("tikzDefaultEngine"), :
“Attempting to calculate the width of a Unicode stringusing the pdftex engine. This may fail! See the Unicodesection of ?tikzDevice for more information.”
Warning message in (function (texString, cex = 1, face = 1, engine = getOption("tikzDefaultEngine"), :
“Attempting to calculate the width of a Unicode stringusing the pdftex engine. This may fail! See the Unicodesection of ?tikzDevice for more information.”
Warning message in (function (texString, cex = 1, face = 1, engine = getOption("tikzDefaultEngine"), :
“Attempting to calculate the width of a Unicode stringusing the pdftex engine. This may fail! See the Unicodesection of ?tikzDevice for more information.”
Warning message in (function (texString, cex = 1, face = 1, engine = getOption("tikzDefaultEngine"), :
“Attempting to calculate the width of a Unicode stringusing the pdftex engine. This may fail! See the Unicodesection of ?tikzDevice for more information.”
pdf: 2

Gütersloh situation

emp_rho_c %>%
    filter(1:n() == 99)
full_truth_county %>%
    filter(date == prediction_date - 7) %>%
    filter(1:n() == 99)
A tibble: 1 × 2
ags growth_rate
<chr> <dbl>
05754 11.72603
A spec_tbl_df: 1 × 4
date ags cases deaths
<date> <chr> <dbl> <dbl>
2020-06-20 05754 856 0

county-level WIS?

qs <- y_total_dist$percentile[c(-1, -length(y_total_dist$percentile))]

wis_county <- nbinom_county_samples %>%
    apply(1, quantile, probs = qs)  %>%
    rbind(
        full_truth_county %>%
            filter(date == prediction_date) %>%
            arrange(ags) %>%
            pull(cases),
        .
    ) %>%
    apply(2, function(x) WIS_decompose(qs, x[-1], x[1])$wis)

mean(wis_county)

showcase_results %>%
    filter(str_starts(variable, "y_total_")) %>%
    mutate(c = as.numeric(str_extract(variable, "y_total_(\\d+)", group=1))) %>%
    inner_join(mutate(ags_county_dict, c=1:400), by = "c") %>%
    inner_join(filter(full_truth_county, date == prediction_date), by=c("ags")) %>%
    rowwise() %>%
    mutate(
        wis = WIS_decompose(
            prob=qs,
            quant=c(`1.0 %`, `2.5 %`, `5.0 %`, `10.0 %`, `15.0 %`, `20.0 %`, `25.0 %`, `30.0 %`, `35.0 %`, `40.0 %`, `45.0 %`, `50.0 %`, `55.0 %`, `60.0 %`, `65.0 %`, `70.0 %`, `75.0 %`, `80.0 %`, `85.0 %`, `90.0 %`, `95.0 %`, `97.5 %`, `99.0 %`),
            actual= cases
           )$wis
    ) %>%
    pull(wis) %>%
    mean()
10.9725215326087
4.68710761599166

Comparison to ECDC forecasts

forecasts_directory <- here("data/results/4_local_outbreak_model")

df_forecasts <- list.files(forecasts_directory, pattern="forecasts_.*.csv", full.names = TRUE)  %>%
    # infer the forecast date from the filename - have to add 10 weeks to the date in the filename as it is the initial date
    lapply(function(file) read_csv(file) %>% mutate(date = ymd(str_extract(string = file, pattern = "\\d{4}-\\d{2}-\\d{2}")) + days(10 * 7))) %>% 
    bind_rows(.)
df_baseline <- read_csv(here("data/processed/KIT_FCH_KIT-baseline.csv")) %>%
    filter(date <= max(df_forecasts$date), date >= min(df_forecasts$date))
df_itww <- read_csv(here("data/processed/KIT_FCH_ITWW-county_repro.csv")) %>%
    filter(date <= max(df_forecasts$date), date >= min(df_forecasts$date))
df_ensemble_kit <- read_csv(here("data/processed/KIT_FCH_KITCOVIDhub-median_ensemble.csv")) %>%
    filter(date <= max(df_forecasts$date), date >= min(df_forecasts$date))
df_ensemble_ecdc <- read_csv(here("data/processed/ECDC_FCH_EuroCOVIDhub-ensemble.csv")) %>%
    filter(date <= max(df_forecasts$date), date >= min(df_forecasts$date))
last_date_KIT <- max(df_ensemble_kit$date)

df_ensemble <- df_ensemble_kit %>%
    rbind(filter(df_ensemble_ecdc, date > last_date_KIT))
all_df <- rbind(
    df_forecasts %>% filter(variable == 'y_total') %>% select(df_baseline %>% names) %>% mutate(model='ssm'),
    df_baseline %>% mutate(model='baseline'),
    df_itww %>% mutate(model='itww'),
    df_ensemble %>% mutate(model='ensemble')
) %>%
    filter(date <= ymd('2021-09-15'), date >= ymd('2020-10-12'))
forecast_dates <- all_df$date %>% unique

tikz/regional_forecasts_comparison.tex

last_date_KIT
# larger figures
options(
    repr.plot.width=16/1.5,
    repr.plot.height=12/1.5
)
kit_ecdc_switch <- tibble(
    model="ensemble",
    last_date=last_date_KIT + 3.5,
    text="switch from KIT to ECDC ensemble"
)

ggplot(all_df, aes(date)) +
    geom_line(aes(y=`50.0 %`, color=model, alpha="median")) +
    geom_line(aes(x = date, y = cases, linetype = "RKI"), data = filter(full_truth, date <= max(all_df$date), date >= min(all_df$date))) +
    geom_ribbon(aes(ymin = `25.0 %`, ymax = `75.0 %`, fill = model, alpha="$50\\%$")) +
    geom_ribbon(aes(ymin = `2.5 %`, ymax = `97.5 %`, fill = model, alpha="$95\\%$")) +
    geom_vline(data=kit_ecdc_switch, aes(xintercept=last_date), linetype=1) +
    geom_text(data=kit_ecdc_switch, aes(x = last_date + 48.5, y=3e5, label=text)) +
    labs(x = "", y = "", linetype = NULL, fill = NULL, alpha="") +
    guides(fill="none", color="none") +
    scale_x_four_weekly(week_start = 6) +
    scale_linetype_manual(values=c("RKI" = 2))+
    scale_alpha_manual(values = c("median" = 1, "$50\\%$" = 0.3, "$95\\%$" = 0.1)) +
    facet_wrap(~model, ncol = 1)  +
    scale_y_log10()

ggsave_tikz(
    here("tikz/regional_forecasts_comparison.tex"),
    width=16/1.5, height=12/1.5
)
pdf: 2

tables/regional_forecasts_combined_performance.tex

binom_test_ci <- function(k, n, digits=2) {
    ci <- binom.test(k, n)$conf.int
    paste0(format(k/n * 100, digits=digits), "\\% (", format(ci[1] * 100, digits=digits), "\\% - ", format(ci[2] * 100, digits=digits), "\\%)")
}
quantiles <- c(0.01, 0.025, 0.05, 0.10, 0.15, 0.2, 0.25, 0.3, 0.35, 0.45, 0.5, 0.55, 0.6, 0.65, 0.70, 0.75, 0.80, 0.85, 0.90, 0.95, 0.975, 0.99)
compute_relative_performance <- function(truth) {
  truth %>% 
    inner_join(all_df, by="date") %>%
    rename(truth=cases) %>%
    rowwise() %>%
    mutate(
        WIS_decompose = list(WIS_decompose(
            prob=quantiles,
            quant=c(`1.0 %`, `2.5 %`, `5.0 %`, `10.0 %`, `15.0 %`, `20.0 %`, `25.0 %`, `30.0 %`, `35.0 %`, `40.0 %`, `45.0 %`, `50.0 %`, `55.0 %`, `60.0 %`, `65.0 %`, `70.0 %`, `75.0 %`, `80.0 %`, `85.0 %`, `90.0 %`, `95.0 %`, `97.5 %`, `99.0 %`),
            actual= truth
            ))
    ) %>%
    unnest_wider(WIS_decompose) %>%
    group_by(model) %>%
    summarize(
        mae = mean(abs(`50.0 %` - truth), na.rm = TRUE),
        WIS = mean(wis),
        sharpness = mean(sharpness),
        underprediction = mean(underprediction),
        overprediction = mean(overprediction),
        mean_error = mean(mean_error),
        length_95 = mean(`97.5 %` - `2.5 %`, na.rm = TRUE),
        length_50 = mean(`75.0 %` - `25.0 %`, na.rm = TRUE),
        coverage_50 = mean(`25.0 %` <= truth & truth <= `75.0 %`, na.rm = TRUE),
        coverage_95 = mean(`2.5 %` <= truth & truth <= `97.5 %`, na.rm = TRUE),
    ) 
}

df_table_performance <- compute_relative_performance(full_truth) %>%
  mutate(coverage_50 = coverage_50 * 100) %>%
  mutate(coverage_95 = coverage_95 * 100) %>%
  rename(
    `MAE` = mae,
    `WIS` = WIS,
    `sharpness` = sharpness,
    `underprediction` = underprediction,
    `overprediction` = overprediction,
    `rescaled MAE` = mean_error,
    `coverage 50% PI [%]` = coverage_50,
    `coverage 95% PI [%]` = coverage_95,
  ) %>%
  select(-length_95, -length_50) %>%
  t() %>%  # Transpose
  as.data.frame() %>%  # Convert to data frame
  rownames_to_column(var = "metric") %>%
  setNames(c("metric", .[1,2:5])) %>%
  slice(-1)
df_table_relative_performance <- df_table_performance %>%
    mutate(across(baseline:ssm, as.numeric)) %>%
    # relative to baseline, first column
    mutate(across(ensemble:ssm, ~ .x / baseline)) %>%
    select(-baseline) %>%
    filter(metric %in% c('MAE', 'WIS'))
# function that formats numbers for LaTeX
format_latex <- function(x) {
  x <- round(as.numeric(x), 2)
  x <- format(x, nsmall = 2, big.mark = ",", scientific = FALSE)
  return(x)
}
library(kableExtra)

# Add baseline column to relative metrics (filled with 1.000)
df_table_relative_performance$baseline <- 1.000

# Reorder columns to match the absolute table structure
df_table_relative_performance <- df_table_relative_performance[, c("metric", "baseline", "ensemble", "itww", "ssm")]

# Combine the datasets
combined_table <- rbind(df_table_performance, df_table_relative_performance)

# Create the combined table with section headers
combined_table %>%
    # round to two digits
    mutate(across(baseline:ssm, format_latex)) %>%
    # add thousands separator in latex
    kable(format="latex", digits=1, booktabs=T, escape=T, align = c("l", "r", "r", "r", "r")) %>%
    #pack_rows("absolute performance", 1, nrow(df_table_performance) - 2) %>%
    pack_rows("WIS components", nrow(df_table_performance) - 5, nrow(df_table_performance)) %>%
    pack_rows("coverage", nrow(df_table_performance) - 1, nrow(df_table_performance)) %>%
    pack_rows("relative to baseline", nrow(df_table_performance) + 1, nrow(combined_table)) %>%
    cat(., file=here("tables/regional_forecasts_combined_metrics.tex"))
combined_table
A data.frame: 10 × 5
metric baseline ensemble itww ssm
<chr> <chr> <chr> <chr> <chr>
MAE 14654.00 9750.84 16531.37 10149.94
WIS 8707.215 5595.531 12708.904 6795.430
sharpness 3023.821 2459.093 890.011 5074.880
underprediction 3239.0455 776.0869 1293.9369 832.6676
overprediction 1753.8314 1934.3247 9789.5615 448.6346
rescaled MAE 690.5170 426.0267 735.3950 439.2476
coverage 50% PI [%] 37.50 43.75 18.75 75.00
coverage 95% PI [%] 91.66667 91.66667 41.66667 100.00000
MAE 1 0.665404667667531 1.1281131431691 0.692639552340658
WIS 1 0.642631541773116 1.45958311584129 0.780436683830593